Pipelining becomes powerful with GridSearchCV


In [ ]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
import numpy as np

In [ ]:
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split


iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y)

The wrong way to do GridSearchCV with preprocessing:


In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

scaler = StandardScaler()
X_preprocessed = scaler.fit_transform(X_train)
param_grid = {'C': 10. ** np.arange(-3, 3), 'gamma': 10. ** np.arange(-3, 3)}

grid = GridSearchCV(SVC(), param_grid=param_grid, cv=5)

The right way to do GridSearchCV with preprocessing


In [ ]:
from sklearn.pipeline import make_pipeline

param_grid_pipeline = {'svc__C': 10. ** np.arange(-3, 3), 'svc__gamma': 10. ** np.arange(-3, 3)}

scaler_pipe = make_pipeline(StandardScaler(), SVC())
grid = GridSearchCV(scaler_pipe, param_grid=param_grid_pipeline, cv=5)

In [ ]:
grid.fit(X_train, y_train)
print(grid.best_params_)

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest


param_grid = {'selectkbest__k': [1, 2, 3, 4], 'svc__C': 10. ** np.arange(-3, 3), 'svc__gamma': 10. ** np.arange(-3, 3)}

scaler_pipe = make_pipeline(SelectKBest(), SVC())
grid = GridSearchCV(scaler_pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)

In [ ]:
text_pipe = make_pipeline(TfidfVectorizer(), LinearSVC())
param_grid = {'tfidifvectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)], 'linearsvc__C': 10. ** np.arange(-3, 3)}

grid = GridSearchCV(text_pipe, param_grid=param_grid, cv=5)